package DataCrawler;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;

// import DataProcessing.Visualizer;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import us.codecraft.webmagic.Page;

// Crawls the danmaku (bullet comments) of a single video identified by its cid.
/**
 * Crawler that stores the danmaku (bullet-comment) XML of a single video and extracts the
 * individual comment texts from it.
 *
 * <p>{@link #process(Page)} keeps the raw response body in {@link #xmlText} and also writes it to
 * {@code resDir + output + ".xml"}; {@code resDir} and {@code output} are inherited fields
 * (presumably configured through {@code CidCrawler} -- verify against that class).
 * {@link #getComments(String)} is a pure function and can be used on any XML string without
 * running a crawl.
 */
public class CommentCrawler extends CidCrawler {
    /** Charset used both for writing the crawled XML and for reading it back in {@link #main}. */
    private static final Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Matches one {@code <d p="...">text</d>} element and captures its text in group 1.
     * {@code [^>]*} stops at the end of the opening tag, so an empty element can no longer swallow
     * the following one; DOTALL lets the body span line breaks.
     */
    private static final Pattern COMMENT_ELEMENT =
            Pattern.compile("<d p=[^>]*>(.*?)</d>", Pattern.DOTALL);

    /** The five entities predefined by XML, with the entity name captured in group 1. */
    private static final Pattern XML_ENTITY = Pattern.compile("&(amp|lt|gt|quot|apos);");

    /** File read by {@link #main} when no path is passed on the command line. */
    private static final String DEFAULT_XML_PATH =
            "D:\\Documents\\Code\\Java\\javawork\\src\\main\\resources\\Comments.xml";

    /** Raw XML of the most recently processed page; {@code null} until a page has been processed. */
    public String xmlText;

    /**
     * Stores the page's raw text in {@link #xmlText} and writes it, UTF-8 encoded, to
     * {@code resDir + output + ".xml"}.
     *
     * <p>A failure to write is reported on standard error and otherwise ignored, so
     * {@link #xmlText} stays usable even when the file could not be saved.
     *
     * @param page the downloaded page whose raw text is the comment XML
     */
    @Override
    public void process(Page page) {
        xmlText = page.getRawText();

        // try-with-resources closes the writer even when write() throws.
        try (FileWriter writer = new FileWriter(resDir + output + ".xml", UTF_8)) {
            writer.write(xmlText);
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }

    /**
     * Extracts the comments from {@link #xmlText}.
     *
     * @return the comment texts in document order; empty if no page has been processed yet
     */
    public List<String> getComments() {
        return getComments(xmlText);
    }

    /**
     * Extracts the text of every non-empty {@code <d p="...">text</d>} element from the given XML.
     *
     * <p>The predefined XML entities ({@code &amp;}, {@code &lt;}, {@code &gt;}, {@code &quot;},
     * {@code &apos;}) are decoded; elements with an empty body are skipped.
     *
     * @param xml the comment XML; may be {@code null}
     * @return a new mutable list of comment texts in document order; empty if {@code xml} is
     *         {@code null} or contains no matching element
     */
    public static List<String> getComments(String xml) {
        List<String> comments = new ArrayList<>();
        if (xml == null) {
            return comments;
        }
        Matcher element = COMMENT_ELEMENT.matcher(xml);
        while (element.find()) {
            String body = element.group(1);
            if (!body.isEmpty()) {
                comments.add(decodeXmlEntities(body));
            }
        }
        return comments;
    }

    /** Replaces the five predefined XML entities in {@code text} with the characters they denote. */
    private static String decodeXmlEntities(String text) {
        if (text.indexOf('&') < 0) {
            return text;
        }
        // Single pass, so "&amp;lt;" correctly becomes "&lt;" rather than "<".
        return XML_ENTITY.matcher(text).replaceAll(match -> {
            switch (match.group(1)) {
                case "amp":
                    return "&";
                case "lt":
                    return "<";
                case "gt":
                    return ">";
                case "quot":
                    return "\"";
                default:
                    return "'";
            }
        });
    }

    /**
     * Demo entry point: loads a previously saved comment XML file, extracts the comments and prints
     * them as a one-column table.
     *
     * @param args optional; {@code args[0]} is the path of the XML file to read, defaulting to the
     *             author's local {@code Comments.xml}
     */
    public static void main(String[] args) {
        var crawler = new CommentCrawler();
        // Live-crawl variant kept from the original author for reference:
        // crawler
        // .setUrl("https://comment.bilibili.com/558690501.xml")
        // .setPageOutputFile("Comments")
        // .setResources("src\\main\\resources\\")
        //         .run();
        String xmlPath = args.length > 0 ? args[0] : DEFAULT_XML_PATH;
        try {
            crawler.xmlText = FileUtils.readFileToString(new File(xmlPath), UTF_8);
        } catch (IOException e) {
            System.err.println(e.getMessage());
            // Without the XML there is nothing to tabulate.
            return;
        }

        var col = StringColumn.create("弹幕", crawler.getComments());
        Table tb = Table.create().addColumns(col);

        // tb.write().csv(crawler.resDir + crawler.output + ".csv");
        System.out.println(tb.print());
        // Visualizer v = new Visualizer(crawler.getComments());
        // v.getWordCloud("src/main/resources/wordcloud.png");
    }
}
